<h1 align="center"> Machine learning-based prediction of early recurrence in glioblastoma patients: a glance towards precision medicine <br><br> [Statistical Analysis]</h1>

<h2>[1] Library</h2>

In [None]:
# OS library
import os
import sys
import argparse
import random
from math import sqrt

# Analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.proportion import proportion_confint

import pingouin as pg
%matplotlib inline

<h2>[2] Data Preprocessing</h2>

<h4>[-] Load the database</h4>

In [None]:
# Build the path to the spreadsheet from the first sys.path entry and read it.
file = os.path.join(sys.path[0], "db.xlsx")
db = pd.read_excel(file)

# Report the table dimensions (one row per patient), then preview the first rows.
n_patients, n_columns = db.shape
print("N° of patients: {}".format(n_patients))
print("N° of columns: {}".format(n_columns))
db.head()

<h4>[-] Drop unwanted columns + create <i>'outcome'</i> column</h4>

In [None]:
# Work on a reduced copy of the loaded table with the listed columns removed.
df = db.drop(['Name_Surname','SURVIVAL', 'OS', '...'], axis = 'columns')

print("Effective features to consider: {} ".format(len(df.columns)-1))
# The message names the column that is actually created below ('outcome').
print("Creating 'outcome' column...")

# Binary target derived from PFS with a cut-off of 6 (months, per the labels below).
# Rows with a missing PFS match neither comparison and therefore keep a NaN outcome.

# 0 = No relapse
df.loc[df['PFS'] > 6, 'outcome'] = 0

# 1 = Early relapse (within 6 months)
df.loc[df['PFS'] <= 6, 'outcome'] = 1

<h2>[3] Count and Frequency</h2>

In [None]:
# Non-null counts of every remaining column, per combination of the grouping keys.
group_keys = ['outcome', '...']
df.groupby(group_keys).count()

In [None]:
# Summary statistics for the selected column.
selected_feature = df['...']
selected_feature.describe()

<h2>[4] Statistical Association</h2>
<ul>
    <li>Levene's test is an inferential statistic used to assess the equality of variances of a variable across two or more groups. If the p-value is above 0.05, the null hypothesis of equal variances is not rejected (no evidence of a difference in variances between the groups)</li>
    <li>One-way ANOVA (F-test) is performed if the variances are equal</li>
</ul>

In [None]:
# Split the variable of interest by outcome group (0 = no early relapse, 1 = early relapse).
non_early = df[df['outcome'] == 0]['...']
early_relapse = df[df['outcome'] == 1]['...']

print(non_early.shape)

# Levene's test for equality of variances between the two groups.
levene_result = stats.levene(non_early, early_relapse)
print(levene_result)
print(stats.f_oneway(non_early, early_relapse))

# Choose the t-test variant from the Levene result instead of editing the cell by hand:
# a Levene p-value below 0.05 switches to Welch's t-test (equal_var=False).
# Written as `not (p < 0.05)` so an undefined (NaN) p-value keeps equal_var=True.
equal_var = not (levene_result.pvalue < 0.05)
print(stats.ttest_ind(non_early, early_relapse, equal_var=equal_var))

In [None]:
# Cross-tabulate the categorical variable against the outcome.
sex_ct = pd.crosstab(df['...'], df['outcome'])
print("--- *** Contingency Table *** --- \n",sex_ct)

# Pearson chi-square test of independence, without Yates' continuity correction.
print("\n--- *** Chi-Square *** ---")
stat, p, dof, expected = stats.chi2_contingency(sex_ct, correction = False)
print("DOF=%d" % dof)
print("Expected values = ", expected)
print("p-value = ", p)
print("stat = ", stat)

# Compare the statistic with the chi-square critical value at the 95% level.
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
verdict_template = (
    '\nDependent (reject H0), [Critical: {}]'
    if abs(stat) >= critical
    else '\nIndependent (fail to reject H0), [Critical: {}]'
)
print(verdict_template.format(critical))

<h4>[-] Holm-Bonferroni correction</h4>

In [None]:
# Uncorrected p-values collected from the univariate tests above.
pvals = [...]

# Holm step-down adjustment; returns the reject flags and the adjusted p-values.
significant, adjusted = pg.multicomp(pvals, alpha=0.05, method='holm')
tab = {'Uncorrected':pvals, 'Adjusted':adjusted, 'Significant':significant}

# Stored under its own name so the patient DataFrame `df`, which the
# multivariable cells below still use, is not overwritten.
holm_table = pd.DataFrame(tab)
holm_table

<h2>[5] Multivariable Analysis</h2>

<h4>[-] One-hot (dummy) encoding</h4>

In [None]:
# Categorical columns to expand into indicator (dummy) columns, prefixed with their source name.
dummy_v = ['localization', '...']
df = pd.get_dummies(df, columns = dummy_v, prefix = dummy_v)

# astype returns a new object, so the cast must be assigned back to take effect.
float_cols = ['..']
df[float_cols] = df[float_cols].astype(float)
df.head(5)

In [None]:
# Columns that enter the multivariable model.
cols_to_keep = ['...']

# Take an explicit copy so that adding a column below modifies an independent
# frame and does not raise SettingWithCopyWarning on a slice of `df`.
data = df[cols_to_keep].copy()

# manually add the intercept
data['intercept'] = 1.0
data.columns

In [None]:
train_cols = ['...']
logit = sm.Logit(data['outcome'], data[train_cols], missing = 'drop')
result = logit.fit()

In [None]:
# Model summary table with confidence intervals at the given significance level.
summary_alpha = 0.05
result.summary(alpha=summary_alpha)

In [None]:
# Fitted coefficients and their p-values, on the log-odds scale.
coef = result.params
p = result.pvalues

# Start from the 95% confidence bounds and append the point estimate as a third column.
conf = result.conf_int(alpha=0.05)
conf['OR'] = coef
conf.columns = ['2.5%', '97.5%', 'OR']

# Exponentiate bounds and estimates to obtain odds ratios.
# The p-values are attached afterwards so they are not exponentiated.
conf = np.exp(conf)
conf['p-value'] = p

<h4>[-] Export Multivariable as Excel file</h4>

In [None]:
# Write the odds-ratio table to an Excel workbook in the current working directory.
output_path = "multivariable.xlsx"
conf.to_excel(output_path)